## Authors: Alexander Herzog and Kenneth Benoit
## Date: May 30, 2015
## Replication file for JOP article "The Most Unkindest Cuts: Speaker Selection and Expressed Government Dissent During Economic Crisis"

# Start from an empty workspace so that nothing left over from an earlier
# session can leak into the replication run (argument name spelled out rather
# than relying on partial matching of `all`).
rm(list = ls(all.names = TRUE))
# library() stops immediately with a clear error when quanteda is missing;
# require() would only warn and let the script fail later at corpus().
# NOTE(review): the calls below (textmodel(), tokenize(), subset() on a
# corpus, dfm(stem = )) look like a 2015-era quanteda API -- confirm the
# installed version before running.
library(quanteda)

# Read the tab-delimited speech table, keeping text columns as character
# vectors rather than factors.
d <- read.delim("Data_budget_debates_1983-2012.tab", stringsAsFactors = FALSE)
# Rename the "year" column to "debate_year" so it cannot be confused with the
# budget year that is derived from it further below.
names(d) <- replace(names(d), names(d) == "year", "debate_year")

# Build the corpus from the speech texts, recording the data provenance.
budgetCorpus <- corpus(
    as.character(d[, "speech"]),
    source = "budget_debates_1983-2012.tab from an SQL dump",
    notes = "From Alex Herzog's database of Irish speech data."
)
# Every column except the speech text itself becomes a document variable.
docvars(budgetCorpus) <- d[, names(d) != "speech"]

# Derive the budget year each speech refers to from the calendar year and
# month of the debate. Speeches matched by none of the rules keep NA.
debate_year <- d$debate_year
month <- d$month
budget_year <- rep(NA, length(debate_year))

# 1983-1996: debates are held at the beginning of the year, so the budget
# year is the debate year itself
heldAtStartOfYear <- debate_year >= 1983 & debate_year <= 1996
budget_year[heldAtStartOfYear] <- debate_year[heldAtStartOfYear]

# 1997: one debate in January/February for the 1997 budget and one in
# December for the 1998 budget
in1997 <- debate_year == 1997
budget_year[in1997 & month <= 2] <- 1997
budget_year[in1997 & month == 12] <- 1998

# 1998-2008 and 2010-2012: debates are held in December for the next year
heldInDecember <- (debate_year >= 1998 & debate_year <= 2008) |
    (debate_year >= 2010 & debate_year <= 2012)
budget_year[heldInDecember] <- debate_year[heldInDecember] + 1

# 2009: the April debate is the supplementary budget and is coded with the
# sentinel value 0; the December debate is for the 2010 budget
in2009 <- debate_year == 2009
budget_year[in2009 & month == 4] <- 0
budget_year[in2009 & month == 12] <- 2010

# cross-check the recoding against the raw debate years
table(budget_year, debate_year, useNA = "ifany")

# store the result on the corpus and drop the temporary vectors
docvars(budgetCorpus, "budget_year") <- budget_year
rm(debate_year, budget_year, month,
   heldAtStartOfYear, in1997, heldInDecember, in2009)


### FUNCTIONS ##########################
# Clean up member names so they can be compared and split into first and
# last names.
#
# Args:
#   x: vector of member names (coerced with as.character()).
# Returns:
#   Character vector of the same length as x with markers, parenthesised
#   qualifiers and leading titles removed, runs of whitespace collapsed, and
#   leading/trailing whitespace stripped.
clean.name <- function(x) {
    mname <- as.character(x)
    # Drop a standalone "RIP" marker. The word boundaries keep the
    # case-insensitive match from deleting "rip" inside a name (e.g. "Ripley").
    mname <- sub("\\bRIP\\b", "", mname, ignore.case = TRUE)
    # Drop parenthesised qualifiers such as "(Cavan)".
    mname <- gsub("\\(.+?\\)", "", mname)
    # Drop a leading two-character title followed by ". " ("Mr. ", "Dr. ", ...).
    # NOTE(review): longer titles such as "Mrs. " are not matched -- confirm
    # none occur in the data.
    mname <- gsub("^.{2}\\. ", "", mname)
    # Join the first detached "Ó " prefix to the word that follows it, so that
    # splitting on spaces keeps the prefix with the surname.
    mname <- sub("Ó ", "Ó", mname, ignore.case = TRUE)
    # Titles written without a full stop.
    mname <- gsub("^Mr ", "", mname, ignore.case = TRUE)
    mname <- gsub("^Ms ", "", mname, ignore.case = TRUE)
    # Collapse the double spaces left behind by the removals above.
    mname <- gsub("\\s{2,}", " ", mname, perl = TRUE)
    # Strip whitespace left at either end (e.g. after a trailing "RIP").
    mname <- gsub("^[[:space:]]+|[[:space:]]+$", "", mname)
    mname
}

# Retrieve the last name: the final space-separated token of the cleaned name.
#
# Args:
#   x: vector of member names; cleaned with clean.name() before splitting.
# Returns:
#   Character vector with one entry per name. A name that is empty after
#   cleaning yields NA_character_, matching what get.fname() returns for it.
get.lname <- function(x) {
    mname <- clean.name(x)
    tokens <- strsplit(mname, " ")
    # vapply() guarantees a character vector; with sapply() an empty name
    # (zero tokens) silently turned the whole result into a list.
    vapply(tokens, function(y) {
        if (length(y) == 0L) NA_character_ else y[length(y)]
    }, character(1))
}

# Retrieve the first name: the first space-separated token of the cleaned name.
#
# Args:
#   x: vector of member names; cleaned with clean.name() before splitting.
# Returns:
#   Character vector with one entry per name (NA_character_ when the cleaned
#   name has no tokens).
get.fname <- function(x) {
    mname <- clean.name(x)
    tokens <- strsplit(mname, " ")
    # vapply() keeps the return type a character vector even for zero-length
    # input, where sapply() returned an empty list.
    vapply(tokens, function(y) y[1], character(1))
}


## flag speakers who were LEAS-CHEANN COMHAIRLE
# Each entry pairs a member name, exactly as it appears before clean.name()
# is applied, with the budget years in which that member is flagged. A speech
# is flagged when its speaker/budget-year combination matches any entry.
docvars(budgetCorpus, "leas_cheann_comhairle") <- local({
    speaker <- docvars(budgetCorpus, "member_name")
    budgetYear <- docvars(budgetCorpus, "budget_year")
    officeHolders <- list(
        list("Mr. John J. Ryan", 1983:1986),
        list("Mr. Jim Tunney", 1987:1992),
        list("Mr. Joe Jacob", 1993:1997),
        list("Dr. Rory O'Hanlon", 1998:2002),
        list("Mr. Séamus Pattison", 2003:2007),
        list("Mr. Brendan Howlin", 2008:2011),
        list("Mr. Michael P. Kitt", 2012:2013)
    )
    matches <- lapply(officeHolders, function(holder) {
        speaker == holder[[1]] & budgetYear %in% holder[[2]]
    })
    # element-wise OR across all office holders
    Reduce(`|`, matches)
})
# table(docvars(subset(budgetCorpus, leas_cheann_comhairle==TRUE), "member_name"), docvars(subset(budgetCorpus, leas_cheann_comhairle==TRUE), "budget_year"))

## flag speakers who were CHEANN COMHAIRLE
# Each entry pairs a member name, exactly as it appears before clean.name()
# is applied, with the budget years in which that member is flagged. A speech
# is flagged when its speaker/budget-year combination matches any entry.
docvars(budgetCorpus, "cheann_comhairle") <- local({
    speaker <- docvars(budgetCorpus, "member_name")
    budgetYear <- docvars(budgetCorpus, "budget_year")
    officeHolders <- list(
        list("Mr. Thomas J. (Cavan) Fitzpatrick", 1983:1986),
        list("Mr. Seán Treacy", 1987:1997),
        list("Mr. Séamus Pattison", 1998:2002),
        list("Dr. Rory O'Hanlon", 2003:2007),
        list("Mr. John O'Donoghue", 2008:2009),
        list("Mr. Séamus Kirk", 2009:2011),
        list("Mr. Sean Barrett", 2012:2013)
    )
    matches <- lapply(officeHolders, function(holder) {
        speaker == holder[[1]] & budgetYear %in% holder[[2]]
    })
    # element-wise OR across all office holders
    Reduce(`|`, matches)
})
# cross-tabulate the flagged speeches by speaker and budget year as a check
table(
    docvars(subset(budgetCorpus, cheann_comhairle==TRUE), "member_name"),
    docvars(subset(budgetCorpus, cheann_comhairle==TRUE), "budget_year")
)


# Replace member_name with its cleaned form, then derive last_name and
# first_name from the cleaned names. This must run after the chair flags
# above, which compare against the uncleaned names.
cleanedNames <- clean.name(docvars(budgetCorpus, "member_name"))
docvars(budgetCorpus, "member_name") <- cleanedNames
docvars(budgetCorpus, "last_name") <- get.lname(cleanedNames)
docvars(budgetCorpus, "first_name") <- get.fname(cleanedNames)
rm(cleanedNames)



## external data to add

# Numeric speaker-year key (memberID * 10000 + budget_year) for every speech;
# the role tables below are matched against it using the same encoding.
speakerYearKey <- docvars(budgetCorpus, "memberID") * 10000 +
    docvars(budgetCorpus, "budget_year")

# read finance ministers and flag their speeches
fm.data <- read.csv("Data_finance_ministers_1983-2013.csv", header = TRUE)
docvars(budgetCorpus, "finance_minister") <-
    speakerYearKey %in% (fm.data$memberID * 10000 + fm.data$year)
table(
    docvars(subset(budgetCorpus, finance_minister==TRUE), "member_name"),
    docvars(subset(budgetCorpus, finance_minister==TRUE), "budget_year")
)


# read opposition spokespersons (shadow finance minister) and flag their
# speeches
os.data <- read.csv("Data_opposition_spokesperson_1983-2013.csv", header = TRUE)
docvars(budgetCorpus, "opposition_spokesperson") <-
    speakerYearKey %in% (os.data$memberID * 10000 + os.data$year)
table(
    docvars(subset(budgetCorpus, opposition_spokesperson==TRUE), "member_name"),
    docvars(subset(budgetCorpus, opposition_spokesperson==TRUE), "budget_year")
)
rm(speakerYearKey)


################################################
## create year-speaker dataset to hold results
################################################

# Keep the speaker-level document variables (selected by name pattern) and
# reduce them to their distinct combinations.
varstokeep <- grep("memberID|const|name|party|budget_year|cheann|finance|opposition|position|department",
                   names(docvars(budgetCorpus)))
yearspeakerData <- unique(docvars(budgetCorpus)[, varstokeep])

# A speaker-year that appears more than once (for instance with several
# departments) is marked from its second occurrence onwards.
isRepeatedSpeakerYear <- duplicated(yearspeakerData[, c("memberID", "budget_year")])
inMoreThanOneDept <- which(isRepeatedSpeakerYear)
dupls <- yearspeakerData[,
                         c("memberID", "budget_year", "department")]
# split into a list with one data frame per speaker-year
duplslist <- split(dupls, dupls$memberID*10000 + dupls$budget_year)
# k-th row of every speaker-year group; a group with fewer than k rows
# contributes a row of NAs
nthDeptRow <- function(k) do.call(rbind, lapply(duplslist, function(l) l[k, ]))
# make it "wide" into dept2 dept3 dept4 columns
# NOTE(review): department2 is taken from the first row of each group, which
# is also the row retained in yearspeakerData below, so it repeats
# `department` -- confirm this is intended.
multidept <- nthDeptRow(1)
names(multidept)[which(names(multidept)=="department")] <- "department2"
multidept$department3 <- nthDeptRow(2)$department
multidept$department4 <- nthDeptRow(3)$department

# Remove the duplicated entries. A logical mask is used on purpose: with no
# duplicates, yearspeakerData[-integer(0), ] would select zero rows and wipe
# the whole dataset.
yearspeakerData <- yearspeakerData[!isRepeatedSpeakerYear, ]
# merge in the extra depts
yearspeakerData <- merge(yearspeakerData, multidept,
                         by=c("memberID", "budget_year"), all.x=TRUE)

#################
## dfm analysis
#################

# number of speeches per budget year (read through docvars() like the rest of
# the script instead of reaching into the corpus internals)
table(docvars(budgetCorpus, "budget_year"))

# Speakers whose pooled speeches in a year do not exceed this many counted
# features are dropped before scaling.
minWordsThreshold <- 200
budgetYears <- 1983:2013

# Document variables used to locate the reference speakers; they do not
# change inside the loop, so look them up once.
speechMemberID <- docvars(budgetCorpus, "memberID")
speechBudgetYear <- docvars(budgetCorpus, "budget_year")
speechByOppSpokesperson <- docvars(budgetCorpus, "opposition_spokesperson")
speechByFinanceMinister <- docvars(budgetCorpus, "finance_minister")

cat("Computing wordscores: \n")
# one data frame per year, bound together once after the loop rather than
# growing a data frame with rbind() on every iteration
yearResults <- vector("list", length(budgetYears))
for (i in seq_along(budgetYears)) {
    yr <- budgetYears[i]
    cat("  ...", yr, "\n")
    # one dfm row per speaker (grouped by memberID) for this budget year,
    # leaving out speeches flagged as made from the chair
    thisdfm <- dfm(subset(budgetCorpus, budget_year==yr &
                                        !leas_cheann_comhairle &
                                        !cheann_comhairle),
                   groups="memberID", stem=FALSE, verbose=FALSE)
    # reference speakers: the first speech in this year flagged as opposition
    # spokesperson / finance minister supplies the memberID
    thisrefOpp <- speechMemberID[which(speechBudgetYear==yr &
                                       speechByOppSpokesperson)][1]
    thisrefGovt <- speechMemberID[which(speechBudgetYear==yr &
                                        speechByFinanceMinister)][1]
    thisdfm <- thisdfm[rowSums(thisdfm) > minWordsThreshold, ]
    # reference scores: NA for every speaker except the two anchors
    # NOTE(review): if an anchor speaker is missing from thisdfm (not found,
    # or removed by the threshold), no reference score is set for that side --
    # confirm how textmodel() behaves in that case.
    refscores <- rep(NA, ndoc(thisdfm))
    refscores[which(docnames(thisdfm) == as.character(thisrefOpp))] <- -1 # opposition
    refscores[which(docnames(thisdfm) == as.character(thisrefGovt))] <- 1 # govt
    ws <- textmodel(thisdfm, refscores, smooth=1, model="wordscores")
    ts <- predict(ws, thisdfm, rescaling="mv", verbose=FALSE)
    yearResults[[i]] <- data.frame(budget_year = yr,
                                   memberID = as.numeric(docnames(thisdfm)),
                                   textscore = ts@textscores$textscore_mv)
}
textResults <- do.call(rbind, yearResults)
cat("finished.\n")

# merge in the wordscores estimates (all = TRUE keeps speaker-years that
# appear in only one of the two tables)
yearspeakerData <- merge(yearspeakerData, textResults, by=c("memberID", "budget_year"), all=TRUE)

# Intermediate save. Make sure the output directory exists first: save() does
# not create it and would otherwise fail.
dir.create("./generated_data", showWarnings = FALSE)
save(yearspeakerData, file="./generated_data/yearspeakerData.RData")

# no wordscore but not leas_cc or cc
# View(yearspeakerData[is.na(yearspeakerData$textscore) &
#                      !yearspeakerData$cheann_comhairle &
#                      !yearspeakerData$leas_cheann_comhairle &
#                      yearspeakerData$budget_year != 0, ])

# (texts(budgetCorpus)[which(docvars(budgetCorpus, "memberID")==1109 & docvars(budgetCorpus, "budget_year")==1984)])


## Add in speech information:
##  nspeeches:  How many recorded speech acts for this speaker, per budget_year
##  nwordstotal:  How many total words this speaker contributed for this budget_year

# grouping shared by both aggregations: one cell per speaker and budget year
speakerYearGroups <- list(memberID = docvars(budgetCorpus)$memberID,
                          budget_year = docvars(budgetCorpus)$budget_year)

# number of speeches per speaker-year
nspeeches <- aggregate(docvars(budgetCorpus)$speechID,
                       by = speakerYearGroups,
                       FUN = length)

# token count of every speech, summed per speaker-year
ntokens <- sapply(texts(budgetCorpus),
                  function(x) length(tokenize(x, simplify=TRUE)))
ncharspeeches <- aggregate(ntokens, by = speakerYearGroups, FUN = sum)

# Both aggregates use the same grouping, so their rows line up and the token
# totals can be attached by position.
temptextdata <- nspeeches
names(temptextdata)[3] <- "nspeeches"
temptextdata$nwordstotal <- ncharspeeches[, 3]
rm(speakerYearGroups)

# merge in the speech data
yearspeakerData <- merge(yearspeakerData, temptextdata,
                         by = c("memberID", "budget_year"), all = TRUE)

## recode leas_cheann_comhairle and cheann_comhairle into position
# The Cheann Comhairle assignments come second, so they win for any row
# carrying both flags.
# NOTE(review): assumes `position` and `position_numeric` already exist in
# yearspeakerData (picked up by the "position" name pattern) -- confirm.
yearspeakerData$position[yearspeakerData$leas_cheann_comhairle] <- "Leas Cheann Comhairle"
yearspeakerData$position[yearspeakerData$cheann_comhairle] <- "Cheann Comhairle"
yearspeakerData$position_numeric[yearspeakerData$leas_cheann_comhairle] <- 4
yearspeakerData$position_numeric[yearspeakerData$cheann_comhairle] <- 5

# remove year=0000 which takes out the 2009 supplementary budget which we ignore
# (which() also drops rows whose budget_year is NA)
yearspeakerData <- yearspeakerData[which(yearspeakerData$budget_year!=0), ]

# Final save. Make sure the output directory exists first: save() does not
# create it, and failing here would lose the whole computation.
dir.create("./generated_data", showWarnings = FALSE)
save(yearspeakerData, file="./generated_data/yearspeakerData.RData")


## Summary stats on speaker numbers
# number of distinct speakers with a wordscore estimate in each budget year
speakersPerYear <- aggregate(textResults$memberID,
                             by = list(textResults$budget_year),
                             FUN = function(x) length(unique(x)))
summary(speakersPerYear)









